topGO: Bioconductor, Paper

topGO (with enrichment_barplot function): GitHub

Demo Dataset: E-MTAB-8411 from The clock gene Bmal1 inhibits macrophage motility, phagocytosis, and impairs defense against pneumonia. PNAS. 2020;117(3):1543-1551.

License: GPL-3.0

Start R

cd /ngs/GO-Enrichment-Analysis-Demo

R

Load package and set path

library("topGO")
library("org.Mm.eg.db")
library("ggplot2")

Load data

If you have downloaded the DESeq2_DEG.txt file with wget:

data <- data.table::fread("DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)

If you like to donwload the file in R now:

data <- data.table::fread("https://raw.githubusercontent.com/ycl6/GO-Enrichment-Analysis-Demo/master/DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)
data
##                    GeneID GeneSymbol      log2fc    pvalue      padj
##     1: ENSMUSG00000000001      Gnai3  0.08804493 0.1925609 0.6146732
##     2: ENSMUSG00000000003       Pbsn          NA        NA        NA
##     3: ENSMUSG00000000028      Cdc45 -0.32106635 0.1401127 0.5331437
##     4: ENSMUSG00000000031        H19 -1.20339889 0.7161464        NA
##     5: ENSMUSG00000000037      Scml2 -0.57746426 0.2979159        NA
##    ---                                                              
## 55381: ENSMUSG00000118636 AC117663.3          NA        NA        NA
## 55382: ENSMUSG00000118637 AL772212.1          NA        NA        NA
## 55383: ENSMUSG00000118638 AL805980.1          NA        NA        NA
## 55384: ENSMUSG00000118639 AL590997.4          NA        NA        NA
## 55385: ENSMUSG00000118640 AC167036.2  0.06415390 0.9208414        NA

Define significance threshold

up.idx <- which(data$padj < 0.05 & data$log2fc > 0) # FDR < 0.05 and logFC > 0
dn.idx <- which(data$padj < 0.05 & data$log2fc < 0) # FDR < 0.05 and logFC < 0
dim(data)
## [1] 55385     5
length(up.idx)
## [1] 383
length(dn.idx)
## [1] 429

Define significant genes

all.genes <- data$GeneSymbol
up.genes <- data[up.idx,]$GeneSymbol
dn.genes <- data[dn.idx,]$GeneSymbol
head(up.genes, 10)
##  [1] "Axin2"  "Hnrnpd" "Kcnn3"  "Mapk7"  "Agpat3" "Sema6b" "Efnb2"  "Il16"   "Ltbp1" 
## [10] "Rgs19"
head(dn.genes, 10)
##  [1] "Cox5a"  "Pdgfb"  "Itga5"  "Cd52"   "Dnmt3l" "Tubb6"  "Ell2"   "Ifrd1"  "Stk38l"
## [10] "Ubl3"

Alternatively, if you only have Ensembl gene ID

all.genes <- data$GeneID
up.genes <- data[up.idx,]$GeneID
dn.genes <- data[dn.idx,]$GeneID

Decide the sub-ontology to test

  • BP: Biological Process
  • CC: Cellular Component
  • MF: Molecular Function
ontology <- "BP"

Decide test algorithm

The default algorithm used by the topGO package is weight01, it is a mixture between the elim and the weight algorithms. Possible choices includes:

  • classic
  • elim
  • weight
  • weight01
  • lea
  • parentchild
algorithm <- "weight01"

Define the statistical test used

For tests based on gene counts:

  • Fischer’s exact test (statistic = "fisher")

For tests based on gene scores or gene ranks:

  • Kolmogorov-Smirnov test (statistic = "ks")
  • t-test (statistic = "t")

For tests based on gene expression:

  • globaltest (statistic = "globaltest")
statistic <- "fisher"

Set outfile prefix

outTitle <- paste0("topGO_GO-", ontology, "_ORA_", algorithm,"_", statistic)
outTitle
## [1] "topGO_GO-BP_ORA_weight01_fisher"

Prepare input data

We use all the annotated genes (all.genes) as the “gene universe” or geneList and the differentially expressed genes (up.genes and dn.genes) as our genes of interest. So, we will define the geneList as 1 if a gene is differentially expressed, 0 otherwise

upList <- factor(as.integer(all.genes %in% up.genes))
names(upList) <- all.genes

head(upList, 30)
##    Gnai3     Pbsn    Cdc45      H19    Scml2     Apoh     Narf     Cav2     Klf6    Scmh1 
##        0        0        0        0        0        0        0        0        0        0 
##    Cox5a     Tbx2     Tbx4     Zfy2     Ngfr     Wnt3    Wnt9a      Fer     Xpo6     Tfe3 
##        0        0        0        0        0        0        0        0        0        0 
##    Axin2    Brat1    Gna12 Slc22a18   Itgb2l    Igsf5   Pih1d2     Dlat     Sdhd    Fgf23 
##        1        0        0        0        0        0        0        0        0        0 
## Levels: 0 1
table(upList)
## upList
##     0     1 
## 55002   383
dnList <- factor(as.integer(all.genes %in% dn.genes))
names(dnList) <- all.genes

head(dnList, 30)
##    Gnai3     Pbsn    Cdc45      H19    Scml2     Apoh     Narf     Cav2     Klf6    Scmh1 
##        0        0        0        0        0        0        0        0        0        0 
##    Cox5a     Tbx2     Tbx4     Zfy2     Ngfr     Wnt3    Wnt9a      Fer     Xpo6     Tfe3 
##        1        0        0        0        0        0        0        0        0        0 
##    Axin2    Brat1    Gna12 Slc22a18   Itgb2l    Igsf5   Pih1d2     Dlat     Sdhd    Fgf23 
##        0        0        0        0        0        0        0        0        0        0 
## Levels: 0 1
table(dnList)
## dnList
##     0     1 
## 54956   429

Create topGOdata object

Here, our geneList is named with gene symbol, hence we use ID = "SYMBOL". If geneList is named with Ensembl gene ID, you need to use ID = "ENSEMBL"

upGOdata <- new("topGOdata", ontology = ontology, allGenes = upList,geneSel = function(x)(x == 1), 
        nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
## 
## Building most specific GOs .....
##  ( 12221 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 16035 GO terms and 38105 relations. )
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Annotating nodes ...............
##  ( 22070 genes annotated to the GO terms. )
dnGOdata <- new("topGOdata", ontology = ontology, allGenes = dnList,geneSel = function(x)(x == 1), 
        nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
## 
## Building most specific GOs .....
##  ( 12221 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 16035 GO terms and 38105 relations. )
## 
## Annotating nodes ...............
##  ( 22070 genes annotated to the GO terms. )

Test for enrichment

upRes <- runTest(upGOdata, algorithm = algorithm, statistic = statistic)
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 4200 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 17:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  8 nodes to be scored    (0 eliminated genes)
## 
##   Level 15:  18 nodes to be scored   (27 eliminated genes)
## 
##   Level 14:  52 nodes to be scored   (228 eliminated genes)
## 
##   Level 13:  86 nodes to be scored   (638 eliminated genes)
## 
##   Level 12:  174 nodes to be scored  (1663 eliminated genes)
## 
##   Level 11:  280 nodes to be scored  (4019 eliminated genes)
## 
##   Level 10:  445 nodes to be scored  (5764 eliminated genes)
## 
##   Level 9:   586 nodes to be scored  (7811 eliminated genes)
## 
##   Level 8:   652 nodes to be scored  (9851 eliminated genes)
## 
##   Level 7:   671 nodes to be scored  (11778 eliminated genes)
## 
##   Level 6:   555 nodes to be scored  (13641 eliminated genes)
## 
##   Level 5:   366 nodes to be scored  (15859 eliminated genes)
## 
##   Level 4:   195 nodes to be scored  (16672 eliminated genes)
## 
##   Level 3:   90 nodes to be scored   (17019 eliminated genes)
## 
##   Level 2:   20 nodes to be scored   (17376 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (17538 eliminated genes)
upRes
## 
## Description:  
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 6778 GO terms scored: 44 terms with p < 0.01
## Annotation data:
##     Annotated genes: 22109 
##     Significant genes: 345 
##     Min. no. of genes annotated to a GO: 10 
##     Nontrivial nodes: 4200
dnRes <- runTest(dnGOdata, algorithm = algorithm, statistic = statistic)
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 4408 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 17:  6 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  11 nodes to be scored   (0 eliminated genes)
## 
##   Level 15:  23 nodes to be scored   (131 eliminated genes)
## 
##   Level 14:  64 nodes to be scored   (262 eliminated genes)
## 
##   Level 13:  111 nodes to be scored  (681 eliminated genes)
## 
##   Level 12:  177 nodes to be scored  (1670 eliminated genes)
## 
##   Level 11:  301 nodes to be scored  (4158 eliminated genes)
## 
##   Level 10:  476 nodes to be scored  (5769 eliminated genes)
## 
##   Level 9:   620 nodes to be scored  (7988 eliminated genes)
## 
##   Level 8:   690 nodes to be scored  (10103 eliminated genes)
## 
##   Level 7:   665 nodes to be scored  (11990 eliminated genes)
## 
##   Level 6:   570 nodes to be scored  (13797 eliminated genes)
## 
##   Level 5:   380 nodes to be scored  (14831 eliminated genes)
## 
##   Level 4:   201 nodes to be scored  (16690 eliminated genes)
## 
##   Level 3:   93 nodes to be scored   (17028 eliminated genes)
## 
##   Level 2:   19 nodes to be scored   (17258 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (17551 eliminated genes)
dnRes
## 
## Description:  
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 6778 GO terms scored: 94 terms with p < 0.01
## Annotation data:
##     Annotated genes: 22109 
##     Significant genes: 389 
##     Min. no. of genes annotated to a GO: 10 
##     Nontrivial nodes: 4408

Plot enrichment

Up-regulated genes

png(paste0(outTitle, "_up.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(upGOdata, upRes, showTerms = 20, numChar = 50, orderBy = "Scores", 
           title = paste0("GO-", ontology," ORA of up-regulated genes"))
invisible(dev.off())
"topGO_GO-BP_ORA_weight01_fisher_up.png"

“topGO_GO-BP_ORA_weight01_fisher_up.png”

Down-regulated genes

png(paste0(outTitle, "_dn.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(dnGOdata, dnRes, showTerms = 20, numChar = 50, orderBy = "Scores",
                   title = paste0("GO-", ontology," ORA of down-regulated genes"))
invisible(dev.off())
"topGO_GO-BP_ORA_weight01_fisher_dn.png"

“topGO_GO-BP_ORA_weight01_fisher_dn.png”

Create result table

Here, we retrieve test results of top 20 GO terms, you can use topNodes = length(usedGO(GOdata_Obj)) to retrieve results of all available GO terms

up.tab <- GenTable(upGOdata, Pval = upRes, topNodes = 20)
up.tab
##         GO.ID                                        Term Annotated Significant Expected
## 1  GO:0090022         regulation of neutrophil chemotaxis        31           5     0.48
## 2  GO:0042754     negative regulation of circadian rhythm        18           4     0.28
## 3  GO:0097278           complement-dependent cytotoxicity        11           3     0.17
## 4  GO:0071732           cellular response to nitric oxide        11           3     0.17
## 5  GO:0042268                     regulation of cytolysis        12           3     0.19
## 6  GO:0050901              leukocyte tethering or rolling        27           4     0.42
## 7  GO:0072659     protein localization to plasma membrane       278          11     4.34
## 8  GO:0010718 positive regulation of epithelial to mes...        47           5     0.73
## 9  GO:0071380 cellular response to prostaglandin E sti...        13           3     0.20
## 10 GO:0031668 cellular response to extracellular stimu...       222           8     3.46
## 11 GO:0021932 hindbrain radial glia guided cell migrat...        14           3     0.22
## 12 GO:0043299                     leukocyte degranulation        67           4     1.05
## 13 GO:0072672                    neutrophil extravasation        15           3     0.23
## 14 GO:0016045                      detection of bacterium        16           3     0.25
## 15 GO:0050830 defense response to Gram-positive bacter...       113           7     1.76
## 16 GO:0070371                       ERK1 and ERK2 cascade       322          13     5.02
## 17 GO:0061158        3'-UTR-mediated mRNA destabilization        17           3     0.27
## 18 GO:0030574                  collagen catabolic process        36           4     0.56
## 19 GO:0002820 negative regulation of adaptive immune r...        47           3     0.73
## 20 GO:0002526                 acute inflammatory response       105           5     1.64
##        Pval
## 1  0.000072
## 2   0.00015
## 3   0.00057
## 4   0.00057
## 5   0.00075
## 6   0.00077
## 7   0.00077
## 8   0.00081
## 9   0.00096
## 10  0.00101
## 11  0.00121
## 12  0.00142
## 13  0.00149
## 14  0.00181
## 15  0.00199
## 16  0.00208
## 17  0.00218
## 18  0.00231
## 19  0.00235
## 20  0.00257
dn.tab <- GenTable(dnGOdata, Pval = dnRes, topNodes = 20)
dn.tab
##         GO.ID                                        Term Annotated Significant Expected
## 1  GO:0051092 positive regulation of NF-kappaB transcr...       128          12     2.25
## 2  GO:0045944 positive regulation of transcription by ...      1221          46    21.48
## 3  GO:0002755 MyD88-dependent toll-like receptor signa...        20           5     0.35
## 4  GO:0006750            glutathione biosynthetic process        12           4     0.21
## 5  GO:1902041 regulation of extrinsic apoptotic signal...        47           8     0.83
## 6  GO:0006954                       inflammatory response       671          37    11.81
## 7  GO:0055114                 oxidation-reduction process       860          31    15.13
## 8  GO:0032946 positive regulation of mononuclear cell ...       138           5     2.43
## 9  GO:0071277            cellular response to calcium ion        77           7     1.35
## 10 GO:0030168                         platelet activation        76           8     1.34
## 11 GO:0009314                       response to radiation       393          13     6.91
## 12 GO:0034599       cellular response to oxidative stress       268          17     4.72
## 13 GO:0034143 regulation of toll-like receptor 4 signa...        20           4     0.35
## 14 GO:0031954 positive regulation of protein autophosp...        23           4     0.40
## 15 GO:0030031                    cell projection assembly       501          17     8.81
## 16 GO:0034101                     erythrocyte homeostasis       138           7     2.43
## 17 GO:0150079 negative regulation of neuroinflammatory...        11           3     0.19
## 18 GO:0034135 regulation of toll-like receptor 2 signa...        11           3     0.19
## 19 GO:0000050                                  urea cycle        11           3     0.19
## 20 GO:0035733            hepatic stellate cell activation        11           3     0.19
##         Pval
## 1  0.0000028
## 2  0.0000168
## 3  0.0000205
## 4  0.0000418
## 5  0.0000509
## 6  0.0000601
## 7    0.00027
## 8    0.00031
## 9    0.00041
## 10   0.00052
## 11   0.00058
## 12   0.00058
## 13   0.00059
## 14   0.00064
## 15   0.00077
## 16   0.00079
## 17   0.00080
## 18   0.00080
## 19   0.00080
## 20   0.00080
# Update table with full GO term name
up.tab$Term <- sapply(up.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
up.tab$Term
##  [1] "regulation of neutrophil chemotaxis"                        
##  [2] "negative regulation of circadian rhythm"                    
##  [3] "complement-dependent cytotoxicity"                          
##  [4] "cellular response to nitric oxide"                          
##  [5] "regulation of cytolysis"                                    
##  [6] "leukocyte tethering or rolling"                             
##  [7] "protein localization to plasma membrane"                    
##  [8] "positive regulation of epithelial to mesenchymal transition"
##  [9] "cellular response to prostaglandin E stimulus"              
## [10] "cellular response to extracellular stimulus"                
## [11] "hindbrain radial glia guided cell migration"                
## [12] "leukocyte degranulation"                                    
## [13] "neutrophil extravasation"                                   
## [14] "detection of bacterium"                                     
## [15] "defense response to Gram-positive bacterium"                
## [16] "ERK1 and ERK2 cascade"                                      
## [17] "3'-UTR-mediated mRNA destabilization"                       
## [18] "collagen catabolic process"                                 
## [19] "negative regulation of adaptive immune response"            
## [20] "acute inflammatory response"
dn.tab$Term <- sapply(dn.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
dn.tab$Term
##  [1] "positive regulation of NF-kappaB transcription factor activity"                
##  [2] "positive regulation of transcription by RNA polymerase II"                     
##  [3] "MyD88-dependent toll-like receptor signaling pathway"                          
##  [4] "glutathione biosynthetic process"                                              
##  [5] "regulation of extrinsic apoptotic signaling pathway via death domain receptors"
##  [6] "inflammatory response"                                                         
##  [7] "oxidation-reduction process"                                                   
##  [8] "positive regulation of mononuclear cell proliferation"                         
##  [9] "cellular response to calcium ion"                                              
## [10] "platelet activation"                                                           
## [11] "response to radiation"                                                         
## [12] "cellular response to oxidative stress"                                         
## [13] "regulation of toll-like receptor 4 signaling pathway"                          
## [14] "positive regulation of protein autophosphorylation"                            
## [15] "cell projection assembly"                                                      
## [16] "erythrocyte homeostasis"                                                       
## [17] "negative regulation of neuroinflammatory response"                             
## [18] "regulation of toll-like receptor 2 signaling pathway"                          
## [19] "urea cycle"                                                                    
## [20] "hepatic stellate cell activation"

Add gene symbol to result table

The GenTable function only provide the significant gene count for each significant GO term, here we will add the gene symbols to the result table

# Obtain the list of significant genes
up.sigGenes <- sigGenes(upGOdata)
dn.sigGenes <- sigGenes(dnGOdata)

# Retrieve gene symbols for each GO from the test result
up.AnnoList <- lapply(up.tab$"GO.ID", 
              function(x) as.character(unlist(genesInTerm(object = upGOdata, whichGO = x))))
dn.AnnoList <- lapply(dn.tab$"GO.ID", 
              function(x) as.character(unlist(genesInTerm(object = dnGOdata, whichGO = x))))

up.SigList <- lapply(up.AnnoList, function(x) intersect(x, up.sigGenes))
dn.SigList <- lapply(dn.AnnoList, function(x) intersect(x, dn.sigGenes))

# Coerce gene list to a comma-separated vector
up.tab$Genes <- sapply(up.SigList, paste, collapse = ",")
dn.tab$Genes <- sapply(dn.SigList, paste, collapse = ",")
cbind(head(up.tab$Genes, 5))
##      [,1]                        
## [1,] "Bst1,C5ar2,Lbp,Nod2,Ripor2"
## [2,] "Cry1,Cry2,Per2,Ptger4"     
## [3,] "Cd55,Cfh,Rab27a"           
## [4,] "Bcar1,Hnrnpd,Mtr"          
## [5,] "Cfh,Lbp,Pglyrp1"
cbind(head(dn.tab$Genes, 5))
##      [,1]                                                                                                                                                                                                                                                                    
## [1,] "Cat,Ikbkg,Irak2,Lrrfip2,Malt1,Mtpn,Prkcb,Rab7b,Ticam1,Tlr2,Traf1,Trim8"                                                                                                                                                                                                
## [2,] "Arhgef10l,Arntl,Atf3,Cdkn2a,Cebpa,Dbp,Eaf1,Egr2,Elk3,Ell2,Fosb,Hexb,Htatip2,Ier2,Ier5,Ifrd1,Ikbkg,Irf5,Jag1,Junb,Jund,Kdm6b,Klf9,Maff,Mafg,Mitf,Nfe2l2,Nfkb1,Nfkb2,Nfya,Nr1d1,Nr1d2,Nr1h3,Nr4a2,Osm,Pdgfb,Pprc1,Sesn2,Six1,Slc11a1,Thra,Tlr2,Tnip1,Zbtb17,Zfp691,Zmiz2"
## [3,] "Irak2,Lrrfip2,Tlr1,Tlr2,Tnip1"                                                                                                                                                                                                                                         
## [4,] "Gclc,Gclm,Nfe2l2,Slc7a11"                                                                                                                                                                                                                                              
## [5,] "Atf3,Bcl2l1,Fem1b,Lgals3,Serpine1,Skil,Tmbim1,Tmc8"

Output results to files

write.table(up.tab, file = paste0(outTitle, "_up.txt"), sep = "\t", quote = F, 
        row.names = F, col.names = T)

write.table(dn.tab, file = paste0(outTitle, "_dn.txt"), sep = "\t", quote = F, 
        row.names = F, col.names = T)

Session information

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/r4/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] SparseM_1.78         ggplot2_3.3.2        org.Mm.eg.db_3.11.4  AnnotationDbi_1.50.3
##  [5] IRanges_2.22.2       S4Vectors_0.26.1     Biobase_2.48.0       topGO_2.41.0        
##  [9] graph_1.66.0         BiocGenerics_0.34.0  knitr_1.29          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5         highr_0.8          pillar_1.4.6       compiler_4.0.2    
##  [5] tools_4.0.2        digest_0.6.25      bit_4.0.4          tibble_3.0.3      
##  [9] RSQLite_2.2.0      evaluate_0.14      memoise_1.1.0      lifecycle_0.2.0   
## [13] gtable_0.3.0       lattice_0.20-41    png_0.1-7          pkgconfig_2.0.3   
## [17] rlang_0.4.7        DBI_1.1.0          yaml_2.2.1         xfun_0.16         
## [21] withr_2.2.0        dplyr_1.0.1        stringr_1.4.0      generics_0.0.2    
## [25] vctrs_0.3.2        tidyselect_1.1.0   bit64_4.0.2        grid_4.0.2        
## [29] data.table_1.13.0  glue_1.4.1         R6_2.4.1           rmarkdown_2.3     
## [33] farver_2.0.3       purrr_0.3.4        GO.db_3.11.4       blob_1.2.1        
## [37] magrittr_1.5       ellipsis_0.3.1     scales_1.1.1       htmltools_0.5.0   
## [41] matrixStats_0.56.0 colorspace_1.4-1   labeling_0.3       stringi_1.4.6     
## [45] munsell_0.5.0      crayon_1.3.4